A house's value is more than location and square footage. Just as many features make up a person, an informed party wants to know all the aspects that give a house its value. For example, suppose you want to sell a house and do not know what price to ask — it can be neither too low nor too high. To find a price, you usually look for similar properties in your neighbourhood and, based on the gathered data, assess your house's value. The objective here is to analyse the various parameters and predict house prices.
# Import the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import ensemble
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn import model_selection
from sklearn.metrics import classification_report, confusion_matrix
# Load the inner-city housing dataset; each row is one house-sale record.
house= pd.read_csv("innercity.csv") #Loading Data File
house.head()
def details(df):
    """Return a per-column audit of *df*.

    The result is indexed by column name with three columns:
    'Null Values' (missing count), 'Data Type' (dtype), and
    'No. of Unique Values' (cardinality).
    """
    return pd.DataFrame({
        'Null Values': df.isnull().sum(),
        'Data Type': df.dtypes,
        'No. of Unique Values': df.nunique(),
    })
# Quick audit of the raw data: nulls/dtypes/cardinality, column names,
# summary statistics, dimensions, and each feature's correlation with price.
details(house)
house.columns
house.describe().T
house.shape
house.corr()['price']
# Exploratory Analysis
# No. of Bedrooms — bar chart of how many listings have each bedroom count.
house['room_bed'].value_counts().plot(kind='bar')
plt.title('No. of Bedrooms')
plt.xlabel('Bedrooms')
plt.ylabel('Count')
# FIX: the original wrote `sns.despine` without parentheses, which only
# references the function and never removes the spines — call it.
sns.despine()
# Location of house — joint distribution of latitude vs. longitude.
# NOTE: jointplot creates its own figure, so this plt.figure is redundant
# (kept to preserve the original figure sequence).
plt.figure(figsize=(10,10))
# FIX: seaborn renamed jointplot's `size` parameter to `height` (removed in
# seaborn >= 0.9); `size=10` raises TypeError on current versions.
sns.jointplot(x=house.lat.values, y=house.long.values, height=10)
plt.ylabel('Longitude', fontsize=12)
plt.xlabel('Latitude', fontsize=12)
plt.show()
# FIX: `sns.despine` was referenced without being called (no-op).
sns.despine()
# Correlation heatmap of every numeric attribute pair.
plt.figure(figsize=(25,20))
sns.heatmap(house.corr(), annot=True)
plt.show()
# Histograms of every numeric column for a quick distribution overview.
house.hist(figsize=(15,20))
# Correlation among attributes
# Pairwise scatter matrix of all columns (slow on wide frames).
sns.pairplot(house.iloc[:,:])
# The raw 'dayhours' strings all carry a constant 'T000000' time suffix.
# Strip whitespace and the suffix, parse to datetime, then derive the
# year of sale as a new feature.
house['dayhours'] = house['dayhours'].str.strip().str.replace('T000000', '', regex=False)
house['dayhours'] = pd.to_datetime(house['dayhours'])
house.head()
house['year_sold'] = house['dayhours'].dt.year
house.head()
# Working copy without the raw date (year_sold is already extracted).
house_1 = house.drop(columns='dayhours')
house_1[house_1['room_bed'] == 33].index  # locate the 33-bedroom outlier row
house_1['room_bed'] = house_1['room_bed'].astype('category')
# The record id carries no predictive signal.
house_1.drop(columns='cid', inplace=True)
house_1.head()
# Remove the outlier row found above and renumber rows from 0.
house_1.drop(index=750, inplace=True)
house_1.head()
house_1 = house_1.reset_index(drop=True)
# Ordinal / discrete attributes are treated as categoricals.
for cat_col in ['room_bath', 'ceil', 'coast', 'sight', 'condition', 'quality']:
    house_1[cat_col] = house_1[cat_col].astype('category')
# Binary flags: any basement area at all, and whether the house was ever
# renovated (yr_renovated == 0 means "never").
house_1['have_basement'] = (house_1['basement'] != 0).astype(int)
house_1['Is_renovated'] = (house_1['yr_renovated'] != 0).astype(int)
# Age at the time of sale.
house_1['Age_of_house'] = house_1['year_sold'] - house_1['yr_built']
house_1
# Bin latitude into four south-to-north bands.
# FIX: the original chained strict comparisons (i > a and i < b) left gaps at
# the exact cut points — a latitude equal to 47.255900, 47.405900, or
# 47.555900 matched no branch and fell through to 'EN'. Half-open intervals
# (each elif implies i >= previous bound) close those gaps.
lst = []
for i in house_1.lat:
    if i < 47.255900:
        lst.append('ES')   # extreme south
    elif i < 47.405900:
        lst.append('MS')   # mid south
    elif i < 47.555900:
        lst.append('MN')   # mid north
    else:
        lst.append('EN')   # extreme north
house_1['SN_region'] = lst
house_1['SN_region'] = house_1['SN_region'].astype('category')
# Bin |longitude| into four east-to-west bands (longitudes are negative,
# hence abs()).
# FIX: as with SN_region, the original strict comparisons left the exact
# boundary values (122.105000, 122.205000, 122.328000) unmatched, so they
# fell through to 'EW'. Half-open intervals close those gaps.
lst = []
for i in abs(house_1.long):
    if i < 122.105000:
        lst.append('EE')   # extreme east
    elif i < 122.205000:
        lst.append('ME')   # mid east
    elif i < 122.328000:
        lst.append('MW')   # mid west
    else:
        lst.append('EW')   # extreme west
house_1['EW_region'] = lst
house_1['EW_region'] = house_1['EW_region'].astype('category')
# Rows where the sale year precedes the build year by one (Age_of_house == -1)
# are treated as data errors and removed. The intermediate ['year_sold']
# selection is redundant — the row index is the same either way.
delete_index = house_1[house_1['Age_of_house']== -1]['year_sold'].index
house_1.drop(index=delete_index,inplace = True)
house_1.head()
# NOTE(review): this plots the ORIGINAL `house` frame, not the cleaned
# `house_1` — presumably intentional for comparison, but verify.
house.hist(figsize=(25,20))
plt.show()
# Columns excluded from the trend plots: identifiers, the target, and the
# categorical/ordinal attributes — leaving the continuous measures.
excluded = ['dayhours', 'cid', 'price', 'room_bed', 'room_bath',
            'ceil', 'coast', 'sight', 'condition', 'quality',
            'furnished', 'zipcode']
house_def = house.drop(columns=excluded)
def trend():
    """Plot total price grouped by each remaining continuous attribute."""
    for col in house_def.columns:
        grouped = house[[col, 'price']].groupby([col]).sum()
        grouped.plot(figsize=(15, 5))
        plt.show()
trend()
# Same exclusion list as the trend plots, additionally dropping year/location
# columns, leaving only the area-style continuous measures.
excluded_scatter = ['dayhours', 'cid', 'price', 'room_bed', 'room_bath',
                    'ceil', 'coast', 'sight', 'condition', 'quality',
                    'furnished', 'zipcode', 'year_sold', 'yr_renovated',
                    'lat', 'long']
house_def = house.drop(columns=excluded_scatter)
def scatterr():
    """Draw a regression scatter of price against each continuous attribute."""
    for col in house_def.columns:
        plt.figure(figsize=(15, 5))
        sns.regplot(x='price', y=col, data=house, color='#79d13e')
        plt.show()
scatterr()
# Shape / schema check after feature engineering.
house_1.shape
house_1.columns
details(house_1)
data = house_1.copy()
data.head()
# One-hot encode every categorical column (room_bed, room_bath, ceil, coast,
# sight, condition, quality, SN_region, EW_region).
dummy_data = pd.get_dummies(data)
dummy_data.head()
for i in dummy_data.columns:
    print(i)  # list the expanded dummy columns
# Drop raw location/year columns superseded by the engineered features, the
# raw basement area (have_basement flag kept), and the dummy level for the
# removed 33-bedroom outlier.
# NOTE(review): 'total_area' is assumed to be a raw dataset column — verify.
drop_list = ['lat','long','zipcode','yr_built','year_sold','yr_renovated','basement','total_area',
'room_bed_33']
dummy_data.drop(columns=drop_list,inplace =True)
dummy_data.head()
# Per-feature correlation with the target, and a final schema audit.
dummy_data.corr()['price']
details(dummy_data)
# 70/30 train/test split on the full dummy-encoded feature set.
x = dummy_data.drop(columns='price')
y = dummy_data.price
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.3,random_state = 100)
import statsmodels.api as sm
# statsmodels OLS needs an explicit intercept column.
X_train=sm.add_constant(x_train)# Add constant X
X_test = sm.add_constant(x_test)
# OLS fit — the summary is used to read p-values and pick significant features.
ols_model=sm.OLS(y_train,X_train).fit()
residuals = ols_model.resid
ols_model.summary()
# Refit on the subset of features retained from the previous OLS summary
# (presumably the significant ones — verify against that output).
x = dummy_data[['SN_region_MS', 'living_measure', 'sight_4', 'SN_region_EN', 'furnished', 'quality_9', 'quality_10', 'Age_of_house', 'coast_1', 'quality_13', 'sight_0', 'quality_12', 'quality_8', 'room_bath_7.75', 'EW_region_EE', 'condition_5', 'Is_renovated', 'condition_4', 'room_bath_6.0', 'EW_region_MW', 'living_measure15', 'EW_region_EW', 'room_bath_4.75', 'room_bath_8.0', 'ceil_2.5', 'quality_7', 'room_bath_4.25', 'room_bath_3.25', 'room_bath_5.5', 'room_bath_3.75', 'room_bath_5.25', 'room_bath_5.0', 'room_bath_4.5', 'room_bath_4.0', 'quality_11', 'sight_2', 'lot_measure15', 'room_bath_3.5', 'room_bed_4', 'room_bed_6', 'ceil_1.0', 'room_bed_7', 'room_bath_5.75', 'room_bed_5', 'SN_region_ES', 'room_bath_3.0', 'lot_measure', 'room_bath_6.25', 'room_bath_2.25', 'ceil_measure', 'room_bath_6.75']]
# ceil_1.0 is dropped again immediately after being selected above.
x = x.drop(columns='ceil_1.0')
y = dummy_data.price
from sklearn.model_selection import train_test_split
# Same split parameters as before, so rows land in the same train/test halves.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.3,random_state = 100)
import statsmodels.api as sm
X_train=sm.add_constant(x_train)# Add constant X
X_test = sm.add_constant(x_test)
ols_model=sm.OLS(y_train,X_train).fit()
residuals = ols_model.resid
ols_model.summary()
from sklearn.linear_model import LinearRegression
# Plain linear regression on the selected feature subset (no PCA).
lm=LinearRegression()
lm.fit(x_train,y_train)
# Pair each feature name with its fitted coefficient for inspection.
coefficients = pd.concat([pd.DataFrame(x_train.columns),pd.DataFrame(np.transpose(lm.coef_))], axis = 1)
print(coefficients)
print(lm.intercept_)
li_y_pred=lm.predict(x_test)
print(lm.score(x_test,y_test))
import numpy as np
# Goodness of fit computed manually from residual / total sums of squares.
y_test = pd.to_numeric(y_test, errors='coerce')
RSS = np.sum((li_y_pred - y_test)**2)
y_mean = np.mean(y_test)
TSS = np.sum((y_test - y_mean)**2)
R2 = 1 - RSS/TSS
print('R Squared',R2)
n=X_test.shape[0]
# p = number of predictors; X_test includes the statsmodels constant column,
# hence the -1.
p=X_test.shape[1] - 1
adj_rsquared = 1 - (1 - R2) * ((n - 1)/(n-p-1))
print('Adjusted R squared',adj_rsquared)
print(" Root Mean Squared Error: %.4f"
% np.sqrt(np.mean((li_y_pred - y_test) ** 2)))
from sklearn.preprocessing import StandardScaler
# Standardise features — fit on train only so no test-set statistics leak in.
ss=StandardScaler()
x_train_scaled = ss.fit_transform(x_train)
x_test_scaled = ss.transform(x_test)
from sklearn.decomposition import PCA
#Fitting the PCA algorithm with our Data
pca = PCA()
model_pca = pca.fit(x_train_scaled)
#Plotting the Cumulative Summation of the Explained Variance
plt.figure()
plt.plot(np.cumsum(model_pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Variance (%)') #for each component
plt.title('Popularity Dataset Explained Variance')
plt.show()
# Keep 46 of the 50 components, chosen from the cumulative-variance curve.
model_pca = PCA(n_components=46,svd_solver='full')
new_train = model_pca.fit_transform(x_train_scaled)
new_test = model_pca.transform(x_test_scaled)
import pandas as pd
import pylab as pl
from sklearn.decomposition import PCA
# Dump components relations with features: rows are the 46 principal
# components, columns are the original (scaled) features.
pc_labels = ['PC-%d' % k for k in range(1, 47)]
pd.DataFrame(model_pca.components_, columns=x_train.columns, index=pc_labels)
from sklearn.linear_model import LinearRegression
# Linear regression on the PCA-transformed features.
lm = LinearRegression()
lm.fit(new_train, y_train)
# FIX: the coefficients belong to the 46 principal components, not the 50
# original dummy columns — pairing them with x_train.columns misaligned rows
# and padded with NaN. Label them PC-1..PC-46 instead.
coefficients = pd.DataFrame({
    'Component': ['PC-%d' % k for k in range(1, len(lm.coef_) + 1)],
    'Coefficient': lm.coef_,
})
print(coefficients)
print(lm.intercept_)
li_y_pred = lm.predict(new_test)
import numpy as np
# Goodness of fit computed manually from residual / total sums of squares.
y_test = pd.to_numeric(y_test, errors='coerce')
RSS = np.sum((li_y_pred - y_test) ** 2)
y_mean = np.mean(y_test)
TSS = np.sum((y_test - y_mean) ** 2)
R2 = 1 - RSS / TSS
print('R Squared', R2)
n = new_test.shape[0]
# FIX: new_test has no intercept column (unlike the statsmodels X_test), so
# the number of predictors is the full column count — the original's -1
# undercounted p by one.
p = new_test.shape[1]
adj_rsquared = 1 - (1 - R2) * ((n - 1) / (n - p - 1))
print('Adjusted R squared', adj_rsquared)
lin_rmse = np.sqrt(np.mean((li_y_pred - y_test) ** 2))  # reused in the final table
print(" Root Mean Squared Error: %.4f" % lin_rmse)
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.linear_model import Lasso
# L1-regularised linear regression on the PCA components.
# FIX: the `normalize` keyword was deprecated in sklearn 0.24 and removed in
# 1.2, so passing it crashes on current versions. It defaulted to False, and
# the inputs are already standardised upstream, so dropping it preserves
# behaviour.
las = Lasso(alpha=1.0, fit_intercept=True, precompute=False, copy_X=True,
            max_iter=1000, tol=0.0001, warm_start=False, positive=False,
            random_state=None, selection='cyclic')
las.fit(new_train,y_train)
las_predict_pca = las.predict(new_test)
# RMSE / R^2 on the held-out test split.
las_rmse_test_pca = mean_squared_error(y_test,las_predict_pca)**(0.5)
print('\nRMSE on test dataset : ', las_rmse_test_pca)
las_r2 = r2_score(y_test,las_predict_pca)
print('R square on test dataset is %1.3f' %r2_score(y_test,las_predict_pca))
# FIX: label the coefficients by principal component — the model was fit on
# 46 PCA components, so pairing with the 50 x_train column names misaligned.
coefficients = pd.DataFrame({
    'Component': ['PC-%d' % k for k in range(1, len(las.coef_) + 1)],
    'Coefficient': las.coef_,
})
print(coefficients)
print(las.intercept_)
from sklearn.linear_model import Ridge
# L2-regularised linear regression on the PCA components.
# FIX: the `normalize` keyword was deprecated in sklearn 0.24 and removed in
# 1.2 (it defaulted to False); drop it to keep the call working on current
# versions with identical behaviour.
rid = Ridge(alpha=2.0, fit_intercept=True, copy_X=True, max_iter=None,
            tol=0.001, solver='auto', random_state=None)
rid.fit(new_train,y_train)
rid_predict_pca = rid.predict(new_test)
# RMSE / R^2 on the held-out test split.
rid_rmse_test_pca = mean_squared_error(y_test,rid_predict_pca)**(0.5)
print('\nRMSE on test dataset : ', rid_rmse_test_pca)
rid_r2 =r2_score(y_test,rid_predict_pca)
print('R square on test dataset is %1.3f' %r2_score(y_test,rid_predict_pca))
# FIX: label coefficients by principal component (46 coefs vs 50 names).
coefficients = pd.DataFrame({
    'Component': ['PC-%d' % k for k in range(1, len(rid.coef_) + 1)],
    'Coefficient': rid.coef_,
})
print(coefficients)
print(rid.intercept_)
# Import the decision tree REGRESSOR from sklearn (the original comment said
# "classifier" — this is a regression task).
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error,r2_score
# Fitting the decision tree with default hyperparameters, apart from
# max_depth which is 5 so that we can plot and read the tree.
dtree = DecisionTreeRegressor(max_depth=5)
dtree.fit(new_train,y_train)
# predict the target on the new train dataset
dtree_pca_train_pred = dtree.predict(new_train)
# predict the target on the new test dataset
dtree_predict_test_pca = dtree.predict(new_test)
# RMSE / R^2 on the held-out test split.
dtree_rmse_test_pca = mean_squared_error(y_test,dtree_predict_test_pca)**(0.5)
print('\nRMSE on test dataset : ', dtree_rmse_test_pca)
dtree_r2 =r2_score(y_test,dtree_predict_test_pca)
print('R square on test dataset is %1.3f' %r2_score(y_test,dtree_predict_test_pca))
from sklearn.ensemble import RandomForestRegressor
# 200-tree random forest on the PCA components, using all CPU cores.
# NOTE(review): no random_state is set, so metrics vary between runs.
rf_reg = RandomForestRegressor(n_estimators=200, n_jobs=-1)
rf_reg.fit(new_train,y_train)
# predict the target on the new test dataset
rf_reg_predict_test_pca = rf_reg.predict(new_test)
# RMSE / R^2 on the held-out test split.
rf_reg_rmse_test_pca = mean_squared_error(y_test,rf_reg_predict_test_pca)**(0.5)
print('RMSE on test dataset : ', rf_reg_rmse_test_pca)
rf_reg_r2 = r2_score(y_test,rf_reg_predict_test_pca)
print('R square on test dataset is %1.3f' % r2_score(y_test,rf_reg_predict_test_pca))
# score() is R^2 again — same quantity as rf_reg_r2 above.
rf_reg.score(new_test, y_test)
from sklearn.ensemble import GradientBoostingRegressor
# Gradient boosting on the PCA components: many shallow trees with a small
# learning rate; 'huber' loss reduces sensitivity to price outliers.
gbr = GradientBoostingRegressor(n_estimators=6000,
                                learning_rate=0.01,
                                max_depth=4,
                                max_features='sqrt',
                                min_samples_leaf=15,
                                min_samples_split=10,
                                loss='huber',
                                random_state=100
                                )
gbr.fit(new_train,y_train)
# predict the target on the new test dataset
gbr_predict_test_pca = gbr.predict(new_test)
# RMSE / R^2 on the held-out test split.
gbr_rmse_test_pca = mean_squared_error(y_test,gbr_predict_test_pca)**(0.5)
print('\nRMSE on new test dataset : ', gbr_rmse_test_pca)
gbr_r2 = r2_score(y_test, gbr_predict_test_pca)
print('R square is %1.3f' % r2_score(y_test, gbr_predict_test_pca))
# Light Gradient Boosting Regressor
from lightgbm import LGBMRegressor
# Default LightGBM hyperparameters; all CPU cores.
lightgbm = LGBMRegressor(n_jobs=-1)
lightgbm.fit(new_train,y_train)
# predict the target on the new test dataset
lgbm_predict_test_pca = lightgbm.predict(new_test)
# RMSE / R^2 on the held-out test split.
lgbm_rmse_test_pca = mean_squared_error(y_test,lgbm_predict_test_pca)**(0.5)
print('\nRMSE on new test dataset : ', lgbm_rmse_test_pca)
lgbm_r2 =r2_score(y_test, lgbm_predict_test_pca)
print('R square is %1.3f' % r2_score(y_test, lgbm_predict_test_pca))
# XGBoost Regressor
from xgboost import XGBRegressor
# Gradient-boosted trees on the PCA components.
# FIXES vs. the original call:
#  - objective 'reg:linear' was renamed 'reg:squarederror' (same loss; the old
#    alias is deprecated and removed in newer XGBoost releases);
#  - `nthread` is a deprecated alias of `n_jobs` (both were passed as -1);
#  - `seed` is a deprecated alias of `random_state`, and the original passed
#    conflicting values (seed=27, random_state=42) — the modern
#    random_state=42 is kept. TODO confirm reproduction of prior results.
xgboost = XGBRegressor(learning_rate=0.01,
                       n_estimators=6000,
                       max_depth=4,
                       min_child_weight=0,
                       gamma=0.6,
                       subsample=0.7,
                       colsample_bytree=0.7,
                       objective='reg:squarederror',
                       scale_pos_weight=1,
                       reg_alpha=0.00006,
                       random_state=42,
                       n_jobs=-1)
xgboost.fit(new_train,y_train)
# predict the target on the new test dataset
xgb_predict_test_pca = xgboost.predict(new_test)
# RMSE / R^2 on the held-out test split.
xgb_rmse_test_pca = mean_squared_error(y_test,xgb_predict_test_pca)**(0.5)
print('\nRMSE on new test dataset : ', xgb_rmse_test_pca)
xgb_r2 = r2_score(y_test, xgb_predict_test_pca)
print('R square is %1.3f' % r2_score(y_test, xgb_predict_test_pca))
# Run all
# Consolidated comparison table: every regressor above was fit on the same
# 46-component PCA features, so the RMSE / R^2 columns are directly comparable.
output = pd.DataFrame({'Regressors With PCA':['Linear Regression','LASSO','Ridge','Decision Tree Regressor','Random Forest Regressor','Gradient Boosting Regressor','Light GBM Regressor', 'XGB Regressor'],
'Root Mean Squared Error': [lin_rmse,las_rmse_test_pca,rid_rmse_test_pca,dtree_rmse_test_pca,rf_reg_rmse_test_pca,gbr_rmse_test_pca,lgbm_rmse_test_pca,xgb_rmse_test_pca],
'R2 Score':[R2,las_r2,rid_r2,dtree_r2,rf_reg_r2,gbr_r2,lgbm_r2,xgb_r2]
})
output